# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import re
import requests
import socks
import socket
import translate

# use proxy
# socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 1080)
# socket.socket = socks.socksocket
#
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
}


def parseSearchResult(data=None):
    soup = BeautifulSoup(data, "lxml")
    h3s = soup.find_all("h3", class_='condensed medium search-result-title')
    print "there are", len(h3s), "items"
    for h3 in h3s:
        h3a = h3.a
        href = h3a.get('href')
        print 'href: ', href

    for h3 in h3s:
        # TODO pass error item when error
        h3a = h3.a
        href = h3a.get('href')
        title = ' '.join(h3a.text.replace("\n", " ").split())

        # print href
        # print title

        # 解析内容
        publish_id, publish_date, summary = parseItemDetailViaUrl(href)

        # translate summary
        summary = translate.TranslateByGoogle(summary, "en")
        # TODO save data to file
        print(u"\t".join((publish_date, title, summary, publish_id, href)))

        import time
        # time.sleep(3)


def parseItemDetail(data=None):
    """Extract (publish_id, publish_date, summary) from an item detail page.

    data: HTML markup of a single item's detail page.
    Returns the 3-tuple (publish_id, publish_date, summary); also prints
    it so long runs show progress.
    """
    page = BeautifulSoup(data, "lxml")

    # The "new-date" div may start with extra words (e.g. "Archive"), so
    # take the labelled fields from the last two lines rather than the first.
    header_lines = page.find("div", class_="new-date").text.strip().split('\n')
    publish_date = re.findall(r"Published:\s*(.+)", header_lines[-2])[0]
    publish_id = re.findall(r"ID:\s*(.+)", header_lines[-1])[0]

    summary = page.find("p", {'itemprop': 'description'}).text.strip()

    print (publish_id, publish_date, summary)
    return (publish_id, publish_date, summary)


def parseItemDetailViaUrl(url=None):
    """Fetch the detail page at *url* and delegate parsing to parseItemDetail.

    Performs one HTTP GET using the module-level browser `headers` and
    returns parseItemDetail's (publish_id, publish_date, summary) tuple.
    """
    response = requests.get(url, headers=headers)
    return parseItemDetail(response.text)


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Process a search result page.')
    parser.add_argument('doc', metavar='doc', type=str, nargs='+',
                        help='doc path')
    args = parser.parse_args()

    # Read the saved search-result page up front and close the file
    # promptly (the original leaked the open handle for the whole run).
    with open(args.doc[0]) as doc_file:
        data = doc_file.read()
    parseSearchResult(data)
